home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Tech Arsenal 1
/
Tech Arsenal (Arsenal Computer).ISO
/
tek-04
/
bipl.zip
/
PROCS.ZIP
/
REGEXP.ICN
< prev
next >
Wrap
Text File
|
1992-09-28
|
25KB
|
761 lines
############################################################################
#
# File: regexp.icn
#
# Subject: Procedures for regular expression pattern matching
#
# Author: Robert J. Alexander
#
# Date: November 8, 1991
#
###########################################################################
#
# This is a kit of procedures to deal with UNIX-like regular expression
# patterns.
#
# These procedures are interesting partly because of the "recursive
# suspension" (or "suspensive recursion" :-) technique used to simulate
# conjunction of an arbitrary number of computed expressions (see
# notes, below).
#
#
# The public procedures are:
#
# ReMatch(pattern,s,i1,i2) : i3,i4,...,iN
# ReFind(pattern,s,i1,i2) : i3,i4,...,iN
# RePat(s) : pattern list
#
#
# ReMatch() produces the sequence of positions in "s" past a substring
# starting at "i1" that matches "pattern", but fails if there is no
# such position. Similar to match(), but is capable of generating
# multiple positions.
#
# ReFind() produces the sequence of positions in "s" where substrings
# begin that match "pattern", but fails if there is no such position.
# Similar to find(). Each position is produced only once, even if
# several possible matches are possible at that position.
#
# "pattern" can be either a string or a pattern list -- see RePat(),
# below.
#
# Default values of s, i1, and i2 are handled as for Icon's built-in
# string scanning procedures such as match().
#
#
# RePat(s) : L
#
# Creates a pattern element list from pattern string "s", but fails if
# the pattern string is not syntactically correct. ReMatch() and
# ReFind() will automatically convert a pattern string to a pattern
# list, but it is faster to do the conversion explicitly if multiple
# operations are done using the same pattern. An additional advantage
# to compiling the pattern separately is avoiding ambiguity of failure
# caused by an incorrect pattern and failure to match a correct pattern.
#
#
# Accessible Global Variables
#
# After a match, the strings matched by parenthesized regular
# expressions are left in list "Re_ParenGroups", and can be accessed by
# subscripting in using the same number as the \N construct.
#
# If it is desired that regular expression format be similar to UNIX
# filename generation patterns but still retain the power of full
# regular expressions, make the following assignments prior to
# compiling the pattern string:
#
# Re_ArbString := "*" # Defaults to ".*"
# Re_AnyString := "?" # Defaults to "."
#
# The sets of characters (csets) that define a word, digits, and white
# space can be modified. The following assignments can be made before
# compiling the pattern string. The character sets are captured when
# the pattern is compiled, so changing them after pattern compilation
# will not alter the behavior of matches unless the pattern string is
# recompiled.
#
# Re_WordChars := 'whatever you like'
# # Defaults to &letters ++ &digits ++ "_"
# Re_Digits := &digits ++ 'ABCDEFabcdef'
# # Defaults to &digits
# Re_Space := 'whatever you like'
# # Defaults to ' \t\v\n\r\f'
#
# These globals are normally not initialized until the first call to
# RePat(), and then only if they are null. They can be explicitly
# initialized to their defaults (if they are null) by calling
# Re_Default().
#
# Characters compiled into patterns can be passed through a
# user-supplied filter procedure, provided in global variable
# Re_Filter. The filtering is done before the characters are bound
# into the pattern. The filter proc is passed one argument, the string
# to filter, and it must return the filtered string as its result. If
# the filter proc fails, the string will be used unfiltered. The
# filter proc is called with an argument of either type string (for
# characters in the pattern) or cset (for character classes [...]). A
# typical use for this facility is to implement case-independent
# matching. All pattern characters can downshifted by assigning
#
# Re_Filter := map
#
# Filtering is done only as the pattern is compiled. Filtering of
# strings to be matched must be explicitly done. Therefore,
# case-independent matching will occur only if map() is applied to all
# strings to be matched.
#
# In the case of patterns containing alternation, ReFind() will
# generally not produce positions in increasing order, but will produce
# all positions from the first term of the alternation (in increasing
# order) followed by all positions from the second (in increasing
# order). If it is necessary that the positions be generated in
# strictly increasing order, with no duplicates, assign any non-null
# value to Re_Ordered:
#
# Re_Ordered := 1
#
# If the Re_Ordered options is chosen, there is a *small* penalty in
# efficiency in some cases, and the co-expression facility is required
# in your Icon implementation. Example:
#
#
# Regular Expression Characters and Features Supported
#
# The regular expression format supported by procedures in this file
# model very closely those supported by the UNIX "egrep" program, with
# modifications as in the Perl programming language definition.
# Following is a brief description of the special characters used in
# regular expressions. In the description, the abbreviation RE means
# regular expression.
#
# c An ordinary character (not one of the special characters
# discussed below) is a one-character RE that matches that
# character.
#
# \c A backslash followed by any special character is a one-
# character RE that matches the special character itself.
#
# Note that backslash escape sequences representing
# non-graphic characters are not supported directly
# by these procedures. Of course, strings coded in an
# Icon program will have such escapes handled by the
# Icon translator. If such escapes must be supported
# in strings read from the run-time environment (e.g.
# files), they will have to be converted by other means,
# such as the Icon Program Library procedure "escape()".
#
# . A period is a one-character RE that matches any
# character.
#
# [string] A non-empty string enclosed in square brackets is a one-
# character RE that matches any *one* character of that
# string. If, the first character is "^" (circumflex),
# the RE matches any character not in the remaining
# characters of the string. The "-" (minus), when between
# two other characters, may be used to indicate a range of
# consecutive ASCII characters (e.g. [0-9] is equivalent to
# [0123456789]). Other special characters stand for
# themselves in a bracketed string.
#
# * Matches zero or more occurrences of the RE to its left.
#
# + Matches one or more occurrences of the RE to its left.
#
# ? Matches zero or one occurrences of the RE to its left.
#
# {N} Matches exactly N occurrences of the RE to its left.
#
# {N,} Matches at least N occurrences of the RE to its left.
#
# {N,M} Matches at least N occurrences but at most M occurrences
# of the RE to its left.
#
# ^ A caret at the beginning of an entire RE constrains
# that RE to match an initial substring of the subject
# string.
#
# $ A currency symbol at the end of an entire RE constrains
# that RE to match a final substring of the subject string.
#
# | Alternation: two REs separated by "|" match either a
# match for the first or a match for the second.
#
# () A RE enclosed in parentheses matches a match for the
# regular expression (parenthesized groups are used
# for grouping, and for accessing the matched string
# subsequently in the match using the \N expression).
#
# \N Where N is a digit in the range 1-9, matches the same
# string of characters as was matched by a parenthesized
# RE to the left in the same RE. The sub-expression
# specified is that beginning with the Nth occurrence
# of "(" counting from the left. E.g., ^(.*)\1$ matches
# a string consisting of two consecutive occurrences of
# the same string.
#
# Perl Extensions
#
# The following extensions to UNIX REs, as specified in the Perl
# programming language, are supported.
#
# \w Matches any alphanumeric (including "_").
# \W Matches any non-alphanumeric.
#
# \b Matches only at a word-boundary (word defined as a string
# of alphanumerics as in \w).
# \B Matches only non-word-boundaries.
#
# \s Matches any white-space character.
# \S Matches any non-white-space character.
#
# \d Matches any digit [0-9].
# \D Matches any non-digit.
#
# \w, \W, \s, \S, \d, \D can be used within [string] REs.
#
#
# Note on Details of Matching
#
# The method of matching differs a bit from UNIX-style regular
# expressions -- particularly where closures are concerned ("*", "+",
# "{}", "?"). UNIX regular expressions are documented to match the
# "longest, leftmost" strings in cases where a choice is needed. The
# procedures in this file are capable of generating all possible
# matches of the pattern, and generate the possibilities by matching
# the fewest first ("shortest, leftmost"). Matching of the various
# pattern elements is performed exactly as though it were an Icon
# conjunction of the pattern elements.
#
#
# Notes on computed conjunction expressions by "suspensive recursion"
#
# A conjunction expression of an arbitrary number of terms can be
# computed in a looping fashion by the following recursive technique:
#
# procedure Conjunct(v)
# if <there is another term to be appended to the conjunction> then
# suspend Conjunct(<the next term expression>)
# else
# suspend v
# end
#
# The argument "v" is needed for producing the value of the last term
# as the value of the conjunction expression, accurately modeling Icon
# conjunction. If the value of the conjunction is not needed, the
# technique can be slightly simplified by eliminating "v":
#
# procedure ConjunctAndProduceNull()
# if <there is another term to be appended to the conjunction> then
# suspend ConjunctAndProduceNull(<the next term expression>)
# else
# suspend
# end
#
# Note that <the next term expression> must still remain in the suspend
# expression to test for failure of the term, although its value is not
# passed to the recursive invocation, This could have been coded as
#
# suspend <the next term expression> & ConjunctAndProduceNull()
#
# but wouldn't have been as provocative.
#
# Since the computed conjunctions in this program are evaluated only for
# their side effects, the second technique is used in two situations:
#
# (1) To compute the conjunction of all of the elements in the
# regular expression pattern list (Re_match1()).
#
# (2) To evaluate the "exactly N times" and "N to M times"
# control operations (Re_NTimes()).
#
############################################################################
record Re_Tok(proc,args)
global Re_ParenGroups,Re_Filter,Re_Ordered
global Re_WordChars,Re_NonWordChars
global Re_Space,Re_NonSpace
global Re_Digits,Re_NonDigits
global Re_ArbString,Re_AnyString
global Re_TabMatch
################### Pattern Translation Procedures ###################
procedure RePat(s) # L
#
# Produce pattern list representing pattern string s.
#
#
# Create a list of pattern elements. Pattern strings are parsed
# and converted into list elements as shown in the following table.
# Since some list elements reference other pattern lists, the
# structure is really a tree.
#
# Token Generates Matches...
# ----- --------- ----------
# ^ Re_Tok(pos,[1]) Start of string or line
# $ Re_Tok(pos,[0]) End of string or line
# . Re_Tok(move,[1]) Any single character
# + Re_Tok(Re_OneOrMore,[tok]) At least one occurrence of
# previous token
# * Re_Tok(Re_ArbNo,[tok]) Zero or more occurrences of
# previous token
# | Re_Tok(Re_Alt,[pattern,pattern]) Either of prior expression
# or next expression
# [...] Re_Tok(Re_TabAny,[cset]) Any single character in
# specified set (see below)
# (...) Re_Tok(Re_MatchReg,[pattern]) Parenthesized pattern as
# single token
# <string of non-special characters> The string of no-special
# Re_Tok(Re+TabMatch,string) characters
# \b Re_Tok(Re_WordBoundary,[Re_WordChars,Re_NonWordChars])
# A word-boundary
# (word default: [A-Za-z0-9_]+)
# \B Re_Tok(Re_NonWordBoundary,[Re_WordChars,Re_NonWordChars])
# A non-word-boundary
# \w Re_Tok(Re_TabAny,[Re_WordChars])A word-character
# \W Re_Tok(Re_TabAny,[Re_NonWordChars]) A non-word-character
# \s Re_Tok(Re_TabAny,[Re_Space]) A space-character
# \S Re_Tok(Re_TabAny,[Re_NonSpace]) A non-space-character
# \d Re_Tok(Re_TabAny,[Re_Digits]) A digit
# \D Re_Tok(Re_TabAny,[Re_NonDigits]) A non-digit
# {n,m} Re_Tok(Re_NToMTimes,[tok,n,m]) n to m occurrences of
# previous token
# {n,} Re_Tok(Re_NOrMoreTimes,[tok,n]) n or more occurrences of
# previous token
# {n} Re_Tok(Re_NTimes,[tok,n]) exactly n occurrences of
# previous token
# ? Re_Tok(Re_ZeroOrOneTimes,[tok]) one or zero occurrences of
# previous token
# \<digit> Re_Tok(Re_MatchParenGroup,[n]) The string matched by
# parenthesis group <digit>
#
local plist
#
# Initialize.
#
initial Re_Default()
Re_WordChars := cset(Re_WordChars)
Re_NonWordChars := ~Re_WordChars
Re_Space := cset(Re_Space)
Re_NonSpace := ~Re_Space
Re_Digits := cset(Re_Digits)
Re_NonDigits := ~Re_Digits
s ? (plist := Re_pat1(0)) | fail
return plist
end
procedure Re_pat1(level) # L
#
# Recursive portion of RePat()
#
local plist,n,m,c,comma
static none,parenNbr
initial {
Re_TabMatch := proc("=",1)
none := []
}
if level = 0 then parenNbr := 0
plist := []
#
# Loop to put pattern elements on list.
#
until pos(0) do {
(="|",plist := [Re_Tok(Re_Alt,[plist,Re_pat1(level + 1) | fail])]) |
put(plist,
## (="^",*plist = 0 | plist[-1].proc === Re_Alt,Re_Tok(pos,[1])) |
(="^",pos(2) | &subject[-2] == ("|" | "("),Re_Tok(pos,[1])) |
(="$",pos(0) | match("|" | ")"),Re_Tok(pos,[0])) |
(match(")"),level > 0,break) |
(=Re_ArbString,Re_Tok(Re_Arb,none)) |
(=Re_AnyString,Re_Tok(move,[1])) |
(="+",Re_Tok(Re_OneOrMore,[Re_prevTok(plist) | fail])) |
(="*",Re_Tok(Re_ArbNo,[Re_prevTok(plist) | fail])) |
1(Re_Tok(Re_TabAny,[c := Re_cset()]),\c | fail) |
3(="(",n := parenNbr +:= 1,
Re_Tok(Re_MatchReg,[Re_pat1(level + 1) | fail,n]),
move(1) | fail) |
(="\\b",Re_Tok(Re_WordBoundary,[Re_WordChars,Re_NonWordChars])) |
(="\\B",Re_Tok(Re_NonWordBoundary,[Re_WordChars,Re_NonWordChars])) |
(="\\w",Re_Tok(Re_TabAny,[Re_WordChars])) |
(="\\W",Re_Tok(Re_TabAny,[Re_NonWordChars])) |
(="\\s",Re_Tok(Re_TabAny,[Re_Space])) |
(="\\S",Re_Tok(Re_TabAny,[Re_NonSpace])) |
(="\\d",Re_Tok(Re_TabAny,[Re_Digits])) |
(="\\D",Re_Tok(Re_TabAny,[Re_NonDigits])) |
(="{",(n := tab(many(&digits)),comma := =(",") | &null,
m := tab(many(&digits)) | &null,="}") | fail,
if \m then Re_Tok(Re_NToMTimes,
[Re_prevTok(plist),integer(n),integer(m)])
else if \comma then Re_Tok(Re_NOrMoreTimes,
[Re_prevTok(plist),integer(n)])
else Re_Tok(Re_NTimes,[Re_prevTok(plist),integer(n)])) |
(="?",Re_Tok(Re_ZeroOrOneTimes,[Re_prevTok(plist) | fail])) |
Re_Tok(Re_TabMatch,[Re_string(level)]) |
(="\\",n := tab(any(&digits)),Re_Tok(Re_MatchParenGroup,[integer(n)]))
) |
fail
}
return plist
end
procedure Re_prevTok(plist)
#
# Pull previous token from the pattern list. This procedure must take
# into account the fact that successive character tokens have been
# optimized into a single string token.
#
local lastTok,s,r
lastTok := pull(plist) | fail
if lastTok.proc === Re_TabMatch then {
s := lastTok.args[1]
r := Re_Tok(Re_TabMatch,[s[-1]])
s[-1] := ""
if *s > 0 then {
put(plist,lastTok)
lastTok.args[1] := s
}
return r
}
return lastTok
end
procedure Re_Default()
#
# Assign default values to regular expression translation globals, but
# only to variables whose values are null.
#
/Re_WordChars := &letters ++ &digits ++ "_"
/Re_Space := ' \t\v\n\r\f'
/Re_Digits := &digits
/Re_ArbString := ".*"
/Re_AnyString := "."
return
end
procedure Re_cset()
#
# Matches a [...] construct and returns a cset.
#
local complement,c,e,ch,chars
="[" | fail
(complement := ="^" | &null,
(e := (="-" | "")) || (c := move(1) || tab(find("]"))),move(1)) |
return &null
c ? {
while chars := tab(upto('-\\')) do {
e ++:= case move(1) of {
"-": chars[1:-1] ++
&cset[ord(chars[-1]) + 1:ord(move(1)) + 2] | return &null
"\\": case ch := move(1) of {
"w": Re_WordChars
"W": Re_NonWordChars
"s": Re_Space
"S": Re_NonSpace
"d": Re_Digits
"D": Re_NonDigits
default: ch
}
}
}
e ++:= tab(0)
if \complement then e := ~e
}
e := (\Re_Filter)(e)
return cset(e)
end
procedure Re_string(level)
#
# Matches a string of non-special characters, returning a string.
#
local special,s,p
static nondigits
initial nondigits := ~&digits
special := if level = 0 then '\\.+*|[({?' else '\\.+*|[({?)'
s := tab(upto(special) | 0)
while ="\\" do {
p := &pos
if tab(any('wWbBsSdD')) |
(tab(any('123456789')) & (pos(0) | any(nondigits))) then {
tab(p - 1)
break
}
s ||:= move(1) || tab(upto(special) | 0)
}
if pos(0) & s[-1] == "$" then {
move(-1)
s[-1] := ""
}
s := string((\Re_Filter)(s))
return "" ~== s
end
##################### Matching Engine Procedures ########################
procedure ReMatch(plist,s,i1,i2) # i3,i4,...,iN
#
# Produce the sequence of positions in s past a string starting at i1
# that matches the pattern plist, but fails if there is no such
# position. Similar to match(), but is capable of generating multiple
# positions.
#
if type(plist) ~== "list" then plist := RePat(plist) | fail
/i1:= if /s := &subject then &pos else 1 ; /i2 := 0
Re_ParenGroups := []
suspend s[i1:i2] ? (Re_match1(plist,1),i1 + &pos - 1)
end
procedure Re_match1(plist,i) # s1,s2,...,sN
#
# Used privately by ReMatch() to simulate a computed conjunction
# expression via recursive generation.
#
local tok
suspend if tok := plist[i] then
Re_tok_match(tok,plist,i) & Re_match1(plist,i + 1) else &null
end
procedure ReFind(plist,s,i1,i2) # i3,i4,...,iN
#
# Produce the sequence of positions in s where strings begin that match
# the pattern plist, but fails if there is no such position. Similar
# to find().
#
local p
if type(plist) ~== "list" then plist := RePat(plist) | fail
/i1 := if /s := &subject then &pos else 1 ; /i2 := 0
s[i1:i2] ? suspend (
tab(Re_skip(plist,1)) &
p := &pos &
Re_match1(plist,1)\1 &
i1 + p - 1)
end
procedure Re_tok_match(tok,plist,i)
#
# Match a single token. Can be recursively called by the token
# procedure.
#
local prc
prc := tok.proc
suspend (
if prc === Re_Arb then Re_Arb(plist,i)
else suspend prc!tok.args
)
end
########## Heuristic Code for Matching Arbitrary Characters ##########
procedure Re_skip(plist,i) # s1,s2,...,sN
#
# Used privately -- match a sequence of strings in s past which a match
# of the first pattern element in plist is likely to succeed. This
# procedure is used for heuristic performance improvement by ReMatch()
# for the ".*" pattern element, and by ReFind().
#
local x,s,p,prc
x := plist[i]
suspend case prc := (\x).proc | &null of {
Re_TabMatch: find!x.args
Re_TabAny: upto!x.args
pos: x.args[1]
## Re_WordBoundary: Re_WordBoundaries!x.args
Re_WordBoundary |
Re_NonWordBoundary:
p := &pos & tab(Re_skip(plist,i + 1)) & prc!x.args & untab(p)
Re_OneOrMore |
Re_MatchParenGroup: if s := (\Re_ParenGroups)[x.args[1]] then
find(s) else &pos to *&subject + 1
Re_NToMTimes |
Re_NOrMoreTimes |
Re_NTimes:
if x.args[2] > 0 then Re_skip(x.args[1],1) else &pos to &subject + 1
Re_MatchReg: Re_skip(x.args[1],1)
Re_Alt:
if \Re_Ordered then
Re_result_merge{Re_skip(x.args[1],1),Re_skip(x.args[2],1)}
else
Re_skip(x.args[1 | 2],1)
default: &pos to *&subject + 1
}
end
procedure Re_result_merge(L)
#
# Programmer-defined control operation to merge the result sequences of
# two integer-producing generators. Both generators must produce their
# result sequences in numerically increasing order with no duplicates,
# and the output sequence will be in increasing order with no
# duplicates.
#
local e1,e2,r1,r2
e1 := L[1] ; e2 := L[2]
r1 := @e1 ; r2 := @e2
while \(r1 | r2) do
if /r2 | \r1 < r2 then
suspend r1 do r1 := @e1 | &null
else if /r1 | r1 > r2 then
suspend r2 do r2 := @e2 | &null
else
r2 := @e2 | &null
end
procedure untab(origPos)
#
# Converts a string scanning expression that moves the cursor to one
# that produces a cursor position and doesn't move the cursor (converts
# something like tab(find(x)) to find(x). The template for using this
# procedure is
#
# origPos := &pos ; tab(x) & ... & untab(origPos)
#
local newPos
newPos := &pos
tab(origPos)
suspend newPos
tab(newPos)
end
####################### Matching Procedures #######################
procedure Re_Arb(plist,i)
#
# Match arbitrary characters (.*)
#
suspend tab(if \plist then Re_skip(plist,i + 1) else 1 to *&subject + 1)
end
procedure Re_TabAny(C)
#
# Match a character of a character set ([...],\w,\W,\s,\S,\d,\D
#)
suspend tab(any(C))
end
procedure Re_MatchReg(tokList,groupNbr)
#
# Match parenthesized group and assign matched string to list Re_ParenGroup
#
local p,s
p := &pos
/Re_ParenGroups := []
every Re_match1(tokList,1) do {
while *Re_ParenGroups < groupNbr do put(Re_ParenGroups)
s := &subject[p:&pos]
Re_ParenGroups[groupNbr] := s
suspend s
}
Re_ParenGroups[groupNbr] := &null
end
procedure Re_WordBoundary(wd,nonwd)
#
# Match word-boundary (\b)
#
suspend ((pos(1),any(wd)) | (pos(0),move(-1),tab(any(wd))) | (move(-1),
(tab(any(wd)),any(nonwd)) | (tab(any(nonwd)),any(wd))),"")
end
procedure Re_NonWordBoundary(wd,nonwd)
#
# Match non-word-boundary (\B)
#
suspend ((pos(1),any(nonwd)) | (pos(0),move(-1),tab(any(nonwd))) | (move(-1),
(tab(any(wd)),any(wd)) | (tab(any(nonwd)),any(nonwd)),""))
end
procedure Re_MatchParenGroup(n)
#
# Match same string matched by previous parenthesized group (\N)
#
local s
suspend if s := \Re_ParenGroups[n] then =s else ""
end
################### Control Operation Procedures ###################
procedure Re_ArbNo(tok)
#
# Match any number of times (*)
#
suspend "" | (Re_tok_match(tok) & Re_ArbNo(tok))
end
procedure Re_OneOrMore(tok)
#
# Match one or more times (+)
#
suspend Re_tok_match(tok) & Re_ArbNo(tok)
end
procedure Re_NToMTimes(tok,n,m)
#
# Match n to m times ({n,m}
#
suspend Re_NTimes(tok,n) & Re_ArbNo(tok)\(m - n + 1)
end
procedure Re_NOrMoreTimes(tok,n)
#
# Match n or more times ({n,})
#
suspend Re_NTimes(tok,n) & Re_ArbNo(tok)
end
procedure Re_NTimes(tok,n)
#
# Match exactly n times ({n})
#
if n > 0 then
suspend Re_tok_match(tok) & Re_NTimes(tok,n - 1)
else suspend
end
procedure Re_ZeroOrOneTimes(tok)
#
# Match zero or one times (?)
#
suspend "" | Re_tok_match(tok)
end
procedure Re_Alt(tokList1,tokList2)
#
# Alternation (|)
#
suspend Re_match1(tokList1 | tokList2,1)
end